### Data Analysis
import numpy as np
import pandas as pd
### Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.express as px
%matplotlib inline
import warnings
# Ignore every warning category so cell output stays uncluttered.
warnings.simplefilter('ignore', category=Warning)
# Show all columns whenever a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Load the player table; the first CSV column becomes the row index.
fifa = pd.read_csv(
    r'E:\Coding Nest\meetup\FIFA-2019\datasets\data.csv',
    index_col=0,
)
fifa.head()
fifa.info()

# Drop the columns that are not referenced anywhere below.
fifa.drop(
    columns=['ID', 'Photo', 'Flag', 'Club Logo', 'Real Face',
             'Jersey Number', 'Loaned From'],
    inplace=True,
)
#Numerical Features
# Replace missing values in every numeric rating column with that column's
# mean.  The filled column is assigned back to the frame rather than calling
# ``fillna(inplace=True)`` on ``fifa[column]``: the chained in-place form acts
# on a temporary object and leaves ``fifa`` unchanged when pandas
# Copy-on-Write is active.
numeric_columns = [
    'International Reputation', 'Skill Moves', 'Weak Foot',
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
]
for column in numeric_columns:
    fifa[column] = fifa[column].fillna(fifa[column].mean())
# Value, Wage and Release Clause Conversion
def value_and_wage_conversion(Value):
    """Convert a money string such as '€110.5M' or '€565K' to a plain number.

    Parameters
    ----------
    Value : str or any
        Amount text with an optional '€' sign and an optional 'M'
        (millions) or 'K' (thousands) suffix.

    Returns
    -------
    float
        The amount as a float when ``Value`` is a string.  Any non-string
        input (NaN, an already converted number, ...) is returned unchanged,
        so applying the function twice does not wipe the column.

    Raises
    ------
    ValueError
        If the string, once the sign and suffix are removed, is not a number.
    """
    if not isinstance(Value, str):
        return Value
    out = Value.replace('€', '')
    if 'M' in out:
        return float(out.replace('M', '')) * 1000000
    if 'K' in out:
        return float(out.replace('K', '')) * 1000
    return float(out)
# Turn the three money columns from text into numbers.
for money_column in ('Value', 'Wage', 'Release Clause'):
    fifa[money_column] = fifa[money_column].apply(value_and_wage_conversion)
# Missing release clauses get the column mean.  The result is assigned back
# because a chained ``fillna(inplace=True)`` does not update ``fifa`` under
# pandas Copy-on-Write.
fifa['Release Clause'] = fifa['Release Clause'].fillna(fifa['Release Clause'].mean())
#Categorical Columns
fifa['Club'] = fifa['Club'].fillna('No Club')
fifa['Position'] = fifa['Position'].fillna('unknown')
#Joined Column
def clean_date(x):
    """Return the part of a string after its last ", " separator.

    For example ``"Jul 1, 2004"`` gives ``"2004"``; a string without the
    separator is returned whole.  Non-string values are passed through
    unchanged.
    """
    if not isinstance(x, str):
        return x
    return x.rsplit(", ", 1)[-1]
# Missing join dates become 0, which clean_date passes through unchanged;
# the column is then stored as text.
fifa['Joined'] = fifa['Joined'].replace(np.nan, 0)
fifa['Joined'] = fifa['Joined'].apply(clean_date).astype('str')
#Contract Valid Until Column
import datetime
fifa['Contract Valid Until'] = pd.to_datetime(fifa['Contract Valid Until'])
fifa['year'] = pd.DatetimeIndex(fifa['Contract Valid Until']).year.astype(str)
# Assign the filled column back instead of using a chained
# ``fillna(inplace=True)``, which does not reach ``fifa`` under pandas
# Copy-on-Write.  The column is cast to object first so the text
# placeholder can sit next to the timestamps.
fifa['Contract Valid Until'] = (
    fifa['Contract Valid Until'].astype(object).fillna('Not available')
)
#Height Column
def clean_height(x):
    """Convert a feet'inches string such as ``"5'7"`` to a height in feet.

    Parameters
    ----------
    x : str or any
        Height text made of feet and inches separated by an apostrophe.

    Returns
    -------
    float
        ``(feet * 12 + inches) / 12`` for string input.  Non-string input is
        returned unchanged, so a missing value stays missing and can be
        mean-imputed afterwards instead of turning into a height of 1.

    Raises
    ------
    IndexError, ValueError
        If the string has no apostrophe or its parts are not integers.
    """
    if not isinstance(x, str):
        return x
    parts = x.split("'")
    feet = int(parts[0])
    inches = int(parts[1])
    return (feet * 12 + inches) / 12
# Convert heights to feet, then fill gaps with the mean.  The filled column
# is assigned back because a chained ``fillna(inplace=True)`` does not update
# ``fifa`` under pandas Copy-on-Write.
fifa['Height'] = fifa['Height'].apply(clean_height)
fifa['Height'] = fifa['Height'].fillna(fifa['Height'].mean())
#Weight Column
def clean_weight(x):
    """Remove the 'lbs' unit from a string; return other values unchanged."""
    return x.replace('lbs', '') if isinstance(x, str) else x
# Strip the unit, convert to float, then fill gaps with the mean.  The
# filled column is assigned back because a chained ``fillna(inplace=True)``
# does not update ``fifa`` under pandas Copy-on-Write.
fifa['Weight'] = fifa['Weight'].apply(clean_weight).astype('float')
fifa['Weight'] = fifa['Weight'].fillna(fifa['Weight'].mean())
# All updates below assign the new column back to ``fifa``.  The previous
# chained forms (``fifa[col].fillna(..., inplace=True)`` and
# ``fifa[col][mask] = value``) write to a temporary object and do not update
# the frame under pandas Copy-on-Write.
#Work Rate Column
fifa['Work Rate'].value_counts()
fifa['Work Rate'] = fifa['Work Rate'].fillna('Medium/ Medium')
#Preferred Foot column
fifa['Preferred Foot'].value_counts()
fifa['Preferred Foot'] = fifa['Preferred Foot'].fillna('Right')
#Body Type Column
fifa['Body Type'].value_counts()
# Player-specific labels are folded into the three generic body types.
body_type_fixes = {
    'Messi': 'Lean',
    'C. Ronaldo': 'Normal',
    'Neymar': 'Lean',
    'Courtois': 'Lean',
    #PLAYER_BODY_TYPE_25 is Normal body type
    'PLAYER_BODY_TYPE_25': 'Normal',
    'Shaqiri': 'Stocky',
    'Akinfenwa': 'Stocky',
}
fifa['Body Type'] = fifa['Body Type'].replace(body_type_fixes)
fifa['Body Type'] = fifa['Body Type'].fillna('Normal')
#Skill Columns
#Function to convert skill rating at each position.
def skillConverter(val):
    """Convert a 'base+bonus' rating string such as ``'88+2'`` to an int.

    Parameters
    ----------
    val : str or any
        Rating text: a base value, optionally followed by ``+`` and a bonus.

    Returns
    -------
    int
        ``base + bonus`` for string input (the bonus counts as 0 when the
        string has no ``+`` part).  Non-string input is returned unchanged.

    Raises
    ------
    ValueError
        If either part of the string is not an integer.
    """
    if not isinstance(val, str):
        return val
    # Split on the separator instead of slicing fixed positions, so the
    # base and the bonus may have any number of digits.
    base, _, bonus = val.partition('+')
    return int(base) + (int(bonus) if bonus else 0)
skill_columns = ['LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW', 'LAM', 'CAM',
                 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM', 'CDM', 'RDM',
                 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB']
# Convert each per-position rating to a number and use 0.0 where it is
# missing.  The result is assigned back in one step because a chained
# ``fillna(inplace=True)`` does not update ``fifa`` under pandas
# Copy-on-Write.
for col in skill_columns:
    fifa[col] = fifa[col].apply(skillConverter).fillna(0.0)
fifa.head()
def pie_count(data, field="Nationality", percent_limit=0.5, title="Number of players by "):
    """Show a plotly pie chart of how many rows hold each value of ``field``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains the column ``field``.
    field : str
        Column whose values are counted.
    percent_limit : float
        Values whose share of all rows is below this percentage are merged
        into a single "Others" slice.
    title : str
        Title prefix; ``field`` is appended to it.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    title += field
    # Name the count column explicitly: from pandas 2.0 the result of
    # value_counts() is called "count", so a bare to_frame() would no longer
    # produce a column named after ``field``.
    counts = data[field].value_counts().to_frame(name=field)
    counts['percentage'] = 100 * counts[field] / counts[field].sum()

    minor = counts[counts['percentage'] < percent_limit]
    # Copy so that adding the "Others" row does not write into a slice.
    major = counts[counts['percentage'] >= percent_limit].copy()
    if not minor.empty:
        # Only add the merged slice when there is something to merge.
        other_label = "Others(<" + str(percent_limit) + "% each)"
        major.loc[other_label] = [minor[field].sum(), minor['percentage'].sum()]

    trace = go.Pie(labels=major.index.tolist(),
                   values=major[field].tolist())
    layout = go.Layout(
        title=title,
        height=700
    )
    fig = go.Figure(data=[trace], layout=layout)
    fig.show()
# One pie chart per categorical column: (column, minimum share in percent
# for a value to get its own slice).
pie_fields = [
    ('Nationality', 0.5),
    ('Club', 0.5),
    ('Preferred Foot', 0.5),
    ('Work Rate', 0.1),
    ('Body Type', 0.1),
    ('Position', 0.1),
]
for pie_field, pie_limit in pie_fields:
    pie_count(fifa, pie_field, pie_limit)
# Bar chart with the number of players at each age.
x = fifa['Age']
plt.figure(figsize=(10,8))
# Pass the series by keyword: current seaborn versions treat the first
# positional argument of countplot as ``data``, not ``x``.
ax = sns.countplot(x=x, color='#00ffff')
ax.set_xlabel(xlabel = 'Age of the Players', fontsize = 16)
ax.set_title(label = 'Distribution of Age of the Players', fontsize = 20)
plt.show()
# Mean 'Overall' and mean 'Potential' per age, joined on the age.
overall = pd.DataFrame(fifa.groupby(["Age"])['Overall'].mean())
potential = pd.DataFrame(fifa.groupby(["Age"])['Potential'].mean())
merged = pd.merge(overall, potential, on='Age', how='inner')
# Expose the age as an ordinary column so it can be named in plt.plot.
merged['Age']= merged.index
fig, ax = plt.subplots(figsize=(10,8))
merged.reset_index(drop = True, inplace = True)
plt.plot('Age', 'Overall', data=merged, marker='.', color='#00ffff', lw=1, label ="Overall" )
plt.plot('Age', 'Potential', data=merged, marker='+', color='#0000cc', lw=1, label = "Potential")
# The x-axis carries the age and the y-axis the mean ratings; the labels
# previously described the x-axis as 'Overall Rating'.
plt.xlabel('Age')
plt.ylabel('Average Rating')
plt.legend();
# Horizontal bar charts of the 20 clubs with the highest mean 'Value' and
# then the highest mean 'Overall'.
for club_metric in ('Value', 'Overall'):
    club_means = fifa.groupby('Club')[club_metric].mean().reset_index()
    # Ascending sort + tail keeps the 20 largest, smallest of them first.
    club = club_means.sort_values(club_metric, ascending=True).tail(20)
    fig = px.bar(club, x=club_metric, y="Club", orientation='h')
    fig.show()
player_features = [
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
]
# Mean of every feature per position (one row per position).
position_means = fifa.groupby(fifa['Position'])[player_features].mean()
# One column per position holding the names of its five highest-mean
# features, highest first.
df_postion = pd.DataFrame({
    position_name: list(features.nlargest(5).index)
    for position_name, features in position_means.iterrows()
})
df_postion.head()
# For every position, pick the player with the highest mean over that
# position's five top features and record position, name and club.
position = []
player = []
club_l = []
for col in df_postion.columns:
    feature_names = list(df_postion[col])
    candidates = fifa[fifa['Position'] == col]
    scores = candidates[feature_names].mean(axis=1)
    # Positional lookup of the first maximum; works for any row index.
    best = candidates.iloc[scores.reset_index(drop=True).idxmax()]
    position.append(col)
    player.append(best['Name'])
    # Read the club from the winning row itself.  Looking it up by name
    # across the whole table can return another player with the same name.
    club_l.append(best['Club'])
# Position codes that belong to each printed section.
gk = ['GK']
forward = ['LS', 'ST', 'RS','LF', 'CF', 'RF']
midfeilder = ['LW','RW', 'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM',
              'RCM', 'RM', 'LDM', 'CDM', 'RDM' ]
# 'RCB' and 'RB' were missing here, so those two positions were never printed.
defenders = ['LWB','RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB']
# Print the selected player of every position, grouped by section.
sections = (
    ('GoalKeeper : ', gk),
    ('\nFORWARD : ', forward),
    ('\nMIDFEILDER : ', midfeilder),
    ('\nDEFENDER : ', defenders),
)
for heading, section_positions in sections:
    print(heading)
    for p , n , c in zip(position , player , club_l):
        if p in section_positions:
            print('{} [Club : {} , Position : {}]'.format(n , c , p))
# Player name used for each position code, kept in a module-level variable
# named after the position.  The formation plots below pass these names as
# annotation labels, and create_football_formation looks each one up in
# fifa['Name'] to find the club.
# NOTE(review): presumably copied by hand from the printed per-position
# results above — confirm they still match when the data changes.
# NOTE(review): 'D. GodÃn' and 'J. MartÃnez' look mis-encoded (probably an
# accented 'í'); verify they match the spelling stored in fifa['Name'],
# otherwise the club lookup by name finds no row.
CAM = 'H. Nakagawa'
CB = 'D. GodÃn'
CDM = 'Casemiro'
CF = 'S. Giovinco'
CM = 'N. Keïta'
GK = 'De Gea'
LAM = 'Paulo Daineiro'
LB = 'Jordi Alba'
LCB = 'G. Chiellini'
LCM = 'David Silva'
LDM = 'N. Kanté'
LF = 'E. Hazard'
LM = 'Douglas Costa'
LS = 'J. MartÃnez'
LW = 'Neymar Jr'
LWB = 'M. Pedersen'
RAM = 'J. Cuadrado'
RB = 'Nélson Semedo'
RCB = 'Sergio Ramos'
RCM = 'L. Modrić'
RDM = 'P. Pogba'
RF = 'L. Messi'
RM = 'Gelson Martins'
RS = 'A. Saint-Maximin'
RW = 'R. Sterling'
RWB = 'M. Millar'
ST = 'Cristiano Ronaldo'
# Vertical marker positions for a row of 1 to 5 players.
_FORMATION_ROW_HEIGHTS = {
    1: [6],
    2: [5, 7.5],
    3: [3.333, 6.666, 9.999],
    4: [2.5, 5, 7.5, 10],
    5: [2, 4, 6, 8, 10],
}


def _formation_caption(name):
    """Return 'name', a newline and '(club)' for a player, or the bare name if unknown."""
    clubs = fifa['Club'][fifa['Name'] == name].values
    if len(clubs) == 0:
        return str(name)
    return str(name + '\n(' + clubs[0] + ')')


def _draw_formation_row(x_pos, size, labels):
    """Draw one row of ``size`` player markers at ``x_pos`` and annotate them."""
    ys = _FORMATION_ROW_HEIGHTS[size]
    xs = [x_pos] * size
    # A slightly larger white marker under the red one gives a white outline.
    plt.scatter(xs, ys, s=400, color='white')
    plt.scatter(xs, ys, s=300, color='red')
    for marker_x, marker_y, name in zip(xs, ys, labels or ()):
        plt.annotate(_formation_caption(name), (marker_x - 0.5, marker_y + 0.5))


def create_football_formation(formation=(), label_1=None,
                              label_2=None, label_3=None,
                              label_4=None, label_4W=None,
                              label_5=None, label_3W=None):
    """Draw a team line-up on the current matplotlib axes.

    Parameters
    ----------
    formation : sequence of int
        Number of players (1 to 5) in each row, starting nearest the
        goalkeeper.  Other values are ignored.
    label_1, label_2, label_3, label_4, label_5 : list of str, optional
        Player names for a row of that size.  Each name is looked up in the
        global ``fifa`` frame ('Name' and 'Club' columns) to add the club to
        the annotation; a name that is not found is shown without a club.
        A row of four is skipped entirely when ``label_4`` is None; other
        rows are drawn without annotations when their labels are None.
    label_3W, label_4W : list of str, optional
        When given, a second row of three / four players is drawn directly
        after the first row of that size.

    Returns
    -------
    None
        Everything is drawn through ``matplotlib.pyplot``.

    Notes
    -----
    The goalkeeper marker and its caption are fixed.  Labels beyond the
    number of markers in a row are ignored.
    """
    # Goalkeeper and the two white lines at the left end of the pitch.
    plt.scatter(x=[1], y=[6], s=300, color='blue')
    plt.annotate('De Gea \n(Manchester United)', (1 - 0.5, 6 + 0.5))
    plt.plot(np.ones((11, )) * 1.5, np.arange(1, 12), 'w-')
    plt.plot(np.ones((5, )) * 0.5, np.arange(4, 9), 'w-')

    first_row_labels = {1: label_1, 2: label_2, 3: label_3, 4: label_4, 5: label_5}
    second_row_labels = {3: label_3W, 4: label_4W}

    n = 0  # x position of the most recently drawn row
    for posi in formation:
        if posi not in first_row_labels:
            continue
        if posi == 4 and label_4 is None:
            continue
        n += 3
        _draw_formation_row(n, posi, first_row_labels[posi])
        extra_labels = second_row_labels.get(posi)
        if extra_labels is not None:
            n += 3
            _draw_formation_row(n, posi, extra_labels)

    # White lines just past the last row and at half of its x position.
    plt.plot(np.ones((5, )) * (n + 0.5), np.arange(4, 9), 'w-')
    plt.plot(np.ones((11, )) * (n / 2), np.arange(1, 12), 'w-')
    plt.yticks([])
    plt.xticks([])
    ax = plt.gca()
    ax.set_facecolor('#28fc03')
# Every line-up to draw: (figure title, players per row, label keyword
# arguments for create_football_formation).
lineup_specs = [
    ('Best Fit for formation 4-4-2', [4, 2],
     dict(label_4=[LWB, LCB, RCB, RWB],
          label_4W=[LW, LCM, CM, RW],
          label_2=[LF, RF])),
    ('OR\nBest Fit for formation 4-4-2', [4, 2],
     dict(label_4=[LB, CB, RCB, RB],
          label_4W=[LAM, LDM, RDM, RAM],
          label_2=[LS, RS])),
    ('OR\nBest Fit for formation 4-4-2', [4, 2],
     dict(label_4=[LB, CB, RCB, RB],
          label_4W=[LW, LDM, RDM, RW],
          label_2=[CF, ST])),
    ('OR\nBest Fit for formation 4-4-2', [4, 2],
     dict(label_4=[LB, CB, RCB, RB],
          label_4W=[LW, LCM, RCM, RW],
          label_2=[CF, ST])),
    ('OR\nBest Fit for formation 4-4-2', [4, 2],
     dict(label_4=[LWB, LCB, RCB, RWB],
          label_4W=[LW, LCM, CM, RW],
          label_2=[LF, RF])),
    ('Best Fit for formation 4-2-3-1', [4, 2, 3, 1],
     dict(label_4=[LWB, LCB, RCB, RWB],
          label_2=[LCM, RCM],
          label_3=[LF, CAM, RF],
          label_1=[ST])),
    ('OR\nBest Fit for formation 4-2-3-1', [4, 2, 3, 1],
     dict(label_4=[LWB, LB, RB, RWB],
          label_2=[LAM, RAM],
          label_3=[LW, CF, RW],
          label_1=[ST])),
    ('OR\nBest Fit for formation 4-2-3-1', [4, 2, 3, 1],
     dict(label_4=[LWB, CB, RCB, RWB],
          label_2=[CM, CAM],
          label_3=[LF, CM, RF],
          label_1=[ST])),
    ('OR\nBest Fit for formation 4-2-3-1', [4, 2, 3, 1],
     dict(label_4=[LWB, LCB, RCB, RWB],
          label_2=[LCM, RCM],
          label_3=[LDM, CAM, RDM],
          label_1=[ST])),
    ('Best Fit for formation 5-4-1', [5, 4, 1],
     dict(label_5=[LWB, LCB, CB, RCB, RWB],
          label_4=[LW, LDM, RDM, RW],
          label_1=[ST])),
    ('Best Fit for formation 4-3-3', [4, 3],
     dict(label_4=[LWB, LCB, RCB, RWB],
          label_3=[LW, CAM, RW],
          label_3W=[LF, ST, RF])),
    ('OR\nBest Fit for formation 4-3-3', [4, 3],
     dict(label_4=[LWB, CB, RB, RWB],
          label_3=[LAM, CM, RAM],
          label_3W=[LS, CF, RS])),
    ('OR\nBest Fit for formation 4-3-3', [4, 3],
     dict(label_4=[LB, LCB, RCB, RB],
          label_3=[LDM, CDM, RDM],
          label_3W=[LF, CF, RF])),
    ('OR\nBest Fit for formation 4-3-3', [4, 3],
     dict(label_4=[LWB, CB, RB, RWB],
          label_3=[LAM, CAM, RAM],
          label_3W=[LS, ST, RS])),
    ('OR\nBest Fit for formation 4-3-3', [4, 3],
     dict(label_4=[LWB, CB, RB, RWB],
          label_3=[LCM, CAM, RCM],
          label_3W=[LF, ST, RF])),
]
# Draw and show one figure per line-up, in the order listed.
for lineup_title, lineup_rows, lineup_labels in lineup_specs:
    plt.figure(1, figsize=(15, 7))
    create_football_formation(formation=lineup_rows, **lineup_labels)
    plt.title(lineup_title)
    plt.show()
#Count the players of every nationality and list the nations with more than 250 players (Major Playing Nations)
nat_counts = fifa["Nationality"].value_counts()
nat_list = [nation for nation, count in nat_counts.items() if count > 250]
#Replace Nationality with a binary indicator variable for 'Major Nation'
def major_nation(df):
    """Return 1 when the row's 'Nationality' is in ``nat_list``, otherwise 0."""
    return 1 if df["Nationality"] in nat_list else 0
fifa['Major_Nation'] = fifa.apply(major_nation, axis='columns')
#Turn Preferred Foot into a binary indicator variable
def right_footed(df):
    """Return 1 when the row's 'Preferred Foot' equals 'Right', otherwise 0."""
    return int(df['Preferred Foot'] == 'Right')
# Evaluate right_footed once per row and store the 0/1 result.
fifa['Right_Foot'] = fifa.apply(right_footed, axis='columns')
#Create a simplified position variable to account for all player positions
def simple_position(df):
    """Return the coarse group of the row's 'Position'.

    The groups are 'GK', 'DF', 'DM', 'MF', 'AM' and 'ST'; a position that
    belongs to none of them is returned unchanged.
    """
    groups = {
        'GK': ('GK',),
        'DF': ('RB', 'LB', 'CB', 'LCB', 'RCB', 'RWB', 'LWB'),
        'DM': ('LDM', 'CDM', 'RDM'),
        'MF': ('LM', 'LCM', 'CM', 'RCM', 'RM'),
        'AM': ('LAM', 'CAM', 'RAM', 'LW', 'RW'),
        'ST': ('RS', 'ST', 'LS', 'CF', 'LF', 'RF'),
    }
    for group_code, members in groups.items():
        if df['Position'] in members:
            return group_code
    return df.Position
fifa['Simple_Position'] = fifa.apply(simple_position,axis = 1)
#Split the Work Rate Column in two
tempwork = fifa["Work Rate"].str.split("/ ", n = 1, expand = True)
#Create new column for first work rate
fifa["WorkRate1"]= tempwork[0]
#Create new column for second work rate
fifa["WorkRate2"]= tempwork[1]
# Drop Columns
fifa.drop(['Name','Nationality','Club','Wage','Preferred Foot','Work Rate','Position','Joined',
           'Contract Valid Until','Release Clause','year'],axis=1,inplace=True)
fifa.head()
fifa['Body Type'].unique()
# Integer code for every body type.
body_type= {
    'Lean':0,
    'Normal':1,
    'Stocky':2
}
fifa['Body Type'] = fifa['Body Type'].map(body_type)
fifa['Simple_Position'].unique()
# Integer code for every simplified position.  The mapping has its own name
# so it does not rebind ``simple_position``, the function applied above;
# reusing that name made the apply call fail when run again.
simple_position_codes = {
    'ST':0,
    'AM':1,
    'GK':2,
    'MF':3,
    'DF':4,
    'DM':5,
    'unknown':6
}
fifa['Simple_Position'] = fifa['Simple_Position'].map(simple_position_codes)
fifa['WorkRate1'].unique()
fifa['WorkRate2'].unique()
# Integer code for every work-rate level.
work_rate ={
    'Medium':0,
    'High':1,
    'Low':2
}
fifa['WorkRate1'] = fifa['WorkRate1'].map(work_rate)
fifa['WorkRate2'] = fifa['WorkRate2'].map(work_rate)
#dropping the converted columns
# NOTE(review): the four columns encoded just above are removed here, so
# none of them is available to the model below — confirm this is intended.
fifa.drop(['Body Type','Simple_Position', 'WorkRate1','WorkRate2'],axis=1, inplace=True)
fifa.head()
fifa.columns
# Columns used as model inputs.
feature_columns = [
    'Age', 'Overall', 'Potential', 'Special', 'International Reputation',
    'Weak Foot', 'Skill Moves', 'Height', 'Weight',
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
    'Major_Nation', 'Right_Foot',
]
X = fifa[feature_columns]
# Target: log(1 + Value).
y = np.log1p(fifa['Value'])
sns.distplot(y, kde=False)
# Keep only the rows whose target is above 7, in both X and y.
y = y[y > 7]
y_index = y.index
X = X.loc[y_index]
print(X.shape, y.shape)
X.head()
X.shape
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.preprocessing import MinMaxScaler
# Scale every feature to [0, 1], then let a Lasso-based selector decide
# which features to keep.
feature_scaler = MinMaxScaler()
X_norm = feature_scaler.fit_transform(X)
sel_ = SelectFromModel(Lasso(alpha=.0001, random_state=42)).fit(X_norm, y)
sel_.get_support()
selected_feat = X.columns[sel_.get_support()]
# let's print some stats
zero_coefficients = np.sum(sel_.estimator_.coef_ == 0)
print('total features: {}'.format(X.shape[1]))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
    zero_coefficients))
selected_feat
X_model = X[selected_feat]
X_model.shape
#Splitting into test and train
from sklearn.model_selection import train_test_split
# 70 % of the rows for training, 30 % for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_model, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
# define some handy analysis support function
from sklearn.metrics import mean_squared_error, explained_variance_score, r2_score
def plot_prediction_analysis(y, y_pred, figsize=(10,4), title=''):
    """Plot predictions against true values and a histogram of the errors.

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Predicted values, aligned with ``y``.
    figsize : tuple
        Size passed to ``plt.subplots``.
    title : str
        Optional figure title; nothing is added when it is empty.

    Returns
    -------
    None
        The left panel shows ``y`` against ``y_pred`` with the identity line
        and RMSE / explained variance / R2 in its title; the right panel
        shows a 50-bin histogram of ``y - y_pred`` with its mean and
        standard deviation in the title.
    """
    fig, axs = plt.subplots(1, 2, figsize=figsize)

    # Left panel: predictions against truth plus the identity line.
    axs[0].scatter(y, y_pred)
    mn = min(np.min(y), np.min(y_pred))
    mx = max(np.max(y), np.max(y_pred))
    axs[0].plot([mn, mx], [mn, mx], c='red')
    axs[0].set_xlabel('$y$')
    # Raw strings: '\h', '\m' and '\s' are not valid escape sequences.
    axs[0].set_ylabel(r'$\hat{y}$')
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    evs = explained_variance_score(y, y_pred)
    r2 = r2_score(y, y_pred)
    axs[0].set_title('rmse = {:.2f}, evs = {:.2f}, r2 = {:,.2f}'.format(rmse, evs, r2))

    # Right panel: distribution of the prediction errors.
    errors = y - y_pred
    axs[1].hist(errors, bins=50)
    avg = np.mean(errors)
    std = np.std(errors)
    axs[1].set_xlabel(r'$y - \hat{y}$')
    axs[1].set_title(r'Histogram prediction error, $\mu$ = {:.2f}, $\sigma$ = {:.2f}'.format(avg, std))

    if title:
        fig.suptitle(title)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
#model = LinearRegression()
# Standardise the features, expand them to degree-2 polynomial terms and
# fit a Lasso regression on the result.
pipeline_steps = (
    ("standard_scaler", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2)),
    ("lin_reg", Lasso(alpha=0.01)),
)
model = Pipeline(pipeline_steps)
model.fit(X_train, y_train)
# Predict both splits, then plot the training set followed by the test set.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)
plot_prediction_analysis(y_train, y_train_pred, title='Polynomial Model - Trainingset')
plot_prediction_analysis(y_test, y_test_pred, title='Polynomial Model - Testset')